This workbook loads the lifebook data and creates a sample subpopulation of the data to be analysed in the rest of the project.
lifebook_tbl <- read_csv("data/dataset.csv", progress = FALSE) %>%
mutate_if(is.character, as.factor)
## Parsed with column specification:
## cols(
## .default = col_character(),
## prem_freq = col_integer(),
## prem_ape = col_double(),
## prem_risk = col_double(),
## policy_startdate = col_date(format = ""),
## policy_enddate = col_date(format = ""),
## policy_duration = col_integer(),
## mort_rating = col_double(),
## sum_assured = col_double(),
## dob_life1 = col_date(format = ""),
## isjointlife = col_logical(),
## islifeonly = col_logical(),
## policy_statuschangedate = col_date(format = ""),
## lapsed = col_logical(),
## subpop = col_logical()
## )
## See spec(...) for full column specifications.
glimpse(lifebook_tbl)
## Observations: 1,500,000
## Variables: 26
## $ policy_id <fctr> C010000056, C010000063, C010000106, C010000141, C010...
## $ countyname <fctr> Offaly County, Fingal, Cork County, Dublin City, Sou...
## $ edname <fctr> Ballyburly, Castleknock-Knockmaroon, Cobh Rural, Cab...
## $ nuts3name <fctr> Midland, Dublin, South-West (IE), Dublin, Dublin, Du...
## $ sa_id <fctr> A187004004, A267040024, A047106023, A268030019, A267...
## $ cluster_id <fctr> n6_c5, n6_c4, n6_c5, n6_c0, n6_c0, n6_c4, n6_c2, n6_...
## $ prod_type <fctr> protection, pension, pension, protection, savings, p...
## $ prem_type <fctr> RP, SP, RP, RP, SP, RP, RP, RP, RP, RP, RP, RP, RP, ...
## $ prem_freq <int> 12, NA, 12, 12, NA, 12, 12, 12, 4, 12, 12, 12, 12, 12...
## $ prem_ape <dbl> 303.27, 1249.95, 1183.20, 981.40, 600.00, 2289.99, 37...
## $ prem_risk <dbl> 204.372, NA, NA, 738.679, NA, NA, NA, NA, NA, NA, NA,...
## $ policy_startdate <date> 1990-01-02, 1990-01-02, 1990-01-02, 1990-01-02, 1990...
## $ policy_enddate <date> 1995-01-02, 2063-03-24, 2071-07-15, 2010-01-02, 2000...
## $ policy_duration <int> 5, NA, NA, 20, 10, NA, 20, NA, 20, 10, NA, 20, 10, 20...
## $ mort_rating <dbl> 150, NA, NA, 100, NA, NA, NA, NA, NA, NA, NA, 100, 20...
## $ sum_assured <dbl> 200000, NA, NA, 250000, NA, NA, NA, NA, NA, NA, NA, 4...
## $ dob_life1 <date> 1962-04-29, 1943-03-24, 1951-07-15, 1955-05-26, 1950...
## $ gender_life1 <fctr> F, M, M, M, M, M, F, F, M, M, M, M, M, M, M, M, M, F...
## $ smoker_life1 <fctr> Q, Q, N, N, S, N, N, Q, Q, N, N, N, S, S, N, S, Q, Q...
## $ isjointlife <lgl> FALSE, NA, NA, FALSE, NA, NA, NA, NA, NA, NA, NA, FAL...
## $ islifeonly <lgl> TRUE, NA, NA, FALSE, NA, NA, NA, NA, NA, NA, NA, TRUE...
## $ mortgage_status <fctr> TERM, NA, NA, MORTDECR, NA, NA, NA, NA, NA, NA, NA, ...
## $ policy_status <fctr> lapsed, lapsed, lapsed, lapsed, lapsed, lapsed, laps...
## $ policy_statuschangedate <date> 1991-03-02, 1998-12-02, 1997-09-02, 1990-07-02, 1998...
## $ lapsed <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,...
## $ subpop <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS...
Having loaded the data, we now separate the data into four categories: logical, numeric, categorical, and date/time.
drop_var <- c('policy_id', 'edname', 'sa_id')
subpop_var <- 'subpop'
subpop_vals <- lifebook_tbl[[subpop_var]] == TRUE
subpop_tbl <- lifebook_tbl %>% filter(subpop_vals)
datatypes_tbl <- lifebook_tbl %>%
select(-one_of(c(subpop_var, drop_var))) %>%
summarise_each(funs(class)) %>%
gather('variable','datatype')
var_types <- datatypes_tbl %>%
.[['datatype']] %>%
unique
gen_list <- lapply(var_types
,function(x) datatypes_tbl %>% filter(datatype == x) %>% .[['variable']])
names(gen_list) <- var_types
type_list <- list(
categorical = c(gen_list$factor, gen_list$logical)
,numeric = c(gen_list$numeric, gen_list$integer)
,datetime = gen_list$Date
)
We first create some simple plots based on the indicator.
ggplot(lifebook_tbl) +
geom_bar(aes(x = subpop)) +
scale_y_continuous(labels = comma) +
ylab("Count")
facet_formula <- formula(paste0("~", subpop_var))
for(plot_var in type_list$categorical) {
cat(paste0("Plot Var: ", plot_var, "\n"))
var_plot <- ggplot(lifebook_tbl) +
geom_bar(aes_string(x = plot_var)) +
facet_wrap(facet_formula, scales = 'free') +
scale_y_continuous(labels = comma) +
xlab(plot_var) +
ylab("Count") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
print(var_plot)
}
## Plot Var: countyname
## Plot Var: nuts3name
## Plot Var: cluster_id
## Plot Var: prod_type
## Plot Var: prem_type
## Plot Var: gender_life1
## Plot Var: smoker_life1
## Plot Var: mortgage_status
## Plot Var: policy_status
## Plot Var: isjointlife
## Plot Var: islifeonly
## Plot Var: lapsed
for(plot_var in type_list$numeric) {
cat(paste0("Plot Var: ", plot_var, "\n"))
var_plot <- ggplot(lifebook_tbl) +
geom_histogram(aes_string(x = plot_var), bins = 50) +
facet_wrap(facet_formula, scales = 'free') +
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = comma) +
xlab(plot_var) +
ylab("Count")
print(var_plot)
}
## Plot Var: prem_ape
## Plot Var: prem_risk
## Warning: Removed 727307 rows containing non-finite values (stat_bin).
## Plot Var: mort_rating
## Warning: Removed 727307 rows containing non-finite values (stat_bin).
## Plot Var: sum_assured
## Warning: Removed 727307 rows containing non-finite values (stat_bin).
## Plot Var: prem_freq
## Warning: Removed 233199 rows containing non-finite values (stat_bin).
## Plot Var: policy_duration
## Warning: Removed 475193 rows containing non-finite values (stat_bin).
for(plot_var in type_list$datetime) {
cat(paste0("Plot Var: ", plot_var, "\n"))
var_plot <- ggplot(lifebook_tbl) +
geom_histogram(aes_string(x = plot_var), bins = 50) +
facet_wrap(facet_formula, scales = 'free') +
scale_x_date(date_labels = '%Y-%m-%d') +
scale_y_continuous(labels = comma) +
xlab(plot_var) +
ylab("Count") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
print(var_plot)
}
## Plot Var: policy_startdate
## Plot Var: policy_enddate
## Plot Var: dob_life1
## Plot Var: policy_statuschangedate
This approach compares the data in the subpopulation against a number of bootstrap samples from the full dataset, and then plots the two against each other.
bootstrap_count <- 250
calc_bootstrap_stats <- function(x, b) {
if(missing(b)) b <- seq_along(x)
use_x <- x[b]
samp_mean <- mean(use_x)
samp_perc <- quantile(use_x
,type = 1
,probs = c(0.50, 0.01, 0.10, 0.25, 0.75, 0.90, 0.99))
return(c(mean = samp_mean, samp_perc))
}
generate_bootstrap_props <- function(data_tbl, cat_varname, count) {
bs_tbl <- data_tbl %>%
sample_n(count) %>%
group_by_(cat_varname) %>%
summarise(count = n()) %>%
mutate(prop = count / sum(count))
return(bs_tbl)
}
Dealing with the bootstrap for categorical variables is not obvious to me.
for(plot_var in type_list$categorical) {
subpop_prop_tbl <- subpop_tbl %>%
group_by_(plot_var) %>%
summarise(count = n()) %>%
mutate(prop = count / sum(count)
,idx = 1) %>%
arrange_(plot_var)
catprop_lst <- list()
for(i in 1:bootstrap_count) {
catprop_lst[[i]] <- lifebook_tbl %>%
generate_bootstrap_props(plot_var, subpop_tbl %>% nrow) %>%
mutate(idx = i)
}
bootprops_tbl <- catprop_lst %>% bind_rows()
rm(catprop_lst)
cat_plot <- ggplot(bootprops_tbl) +
geom_line(aes_string(x = plot_var, y = 'prop', group = 'idx')
,alpha = 0.1) +
geom_line(aes_string(x = plot_var, y = 'prop', group = 1)
,data = subpop_prop_tbl
,colour = 'red') +
expand_limits(y = 0) +
xlab(plot_var) +
ylab("Proportion") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
print(cat_plot)
}
We take bootstrap samples of the full dataset, calculate the statistics and compare them to the corresponding statistic in the subpopulation.
for(plot_var in type_list$numeric) {
cat(paste0("Plot Var: ", plot_var, "\n"))
var_vals <- lifebook_tbl %>% .[[plot_var]]
var_vals <- var_vals[!is.na(var_vals)]
subpop_stats_tbl <- var_vals %>%
calc_bootstrap_stats %>%
t %>%
as_data_frame %>%
gather('variable','value')
data_boot <- boot(var_vals, calc_bootstrap_stats, R = bootstrap_count)
boot_tbl <- data_boot$t %>% as_data_frame
names(boot_tbl) <- subpop_stats_tbl$variable
bootplot_tbl <- boot_tbl %>%
mutate(iter = 1:n()) %>%
gather('variable','value',-iter)
var_plot <- ggplot(bootplot_tbl) +
geom_density(aes(x = value)) +
geom_vline(aes(xintercept = value), data = subpop_stats_tbl, colour = 'red') +
facet_wrap(~variable, scales = 'free') +
scale_x_continuous(labels = comma) +
xlab(plot_var)
print(var_plot)
}
## Plot Var: prem_ape
## Plot Var: prem_risk
## Plot Var: mort_rating
## Plot Var: sum_assured
## Plot Var: prem_freq
## Plot Var: policy_duration
for(plot_var in type_list$datetime) {
cat(paste0("Plot Var: ", plot_var, "\n"))
var_vals <- lifebook_tbl %>% .[[plot_var]]
var_vals <- var_vals[!is.na(var_vals)]
stat_vals <- var_vals %>% calc_bootstrap_stats
subpop_stats_tbl <- data_frame(variable = names(stat_vals)
,value = stat_vals)
data_boot <- boot(var_vals, calc_bootstrap_stats, R = bootstrap_count)
boot_tbl <- data_boot$t %>% as_data_frame
names(boot_tbl) <- subpop_stats_tbl$variable
bootplot_tbl <- boot_tbl %>%
mutate(iter = 1:n()) %>%
gather('variable','value',-iter) %>%
mutate(value = as.Date(value, origin = '1970-01-01'))
var_plot <- ggplot(bootplot_tbl) +
geom_histogram(aes(x = value), bins = 50) +
geom_vline(aes(xintercept = as.numeric(value)), data = subpop_stats_tbl, colour = 'red') +
facet_wrap(~variable, scales = 'free') +
scale_x_date(date_labels = '%Y-%m-%d') +
xlab(plot_var) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
print(var_plot)
}
## Plot Var: policy_startdate
## Plot Var: policy_enddate
## Plot Var: dob_life1
## Plot Var: policy_statuschangedate